In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
import time
import re
import pandas as pd
import numpy as np
import yellowbrick
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.font_manager import FontProperties
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
from sklearn import metrics, preprocessing
from sklearn.svm import SVC
from sklearn.metrics import average_precision_score, precision_score, recall_score, f1_score, confusion_matrix, accuracy_score, classification_report, roc_curve, auc, roc_auc_score, silhouette_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from scipy.cluster.hierarchy import linkage, dendrogram, cut_tree
from sklearn import decomposition
import scipy.stats as stats
from scipy.linalg import eigh
from math import factorial as f
from pylab import rcParams
# Default size (width, height in inches) applied to every matplotlib figure that
# does not set its own figsize.
rcParams['figure.figsize'] = 10, 15
%matplotlib inline
THIS NOTEBOOK EXISTS ONLY FOR THE OUTLIER DETECTION AND IMPUTATION SCENARIO.
THIS IS NOT THE ACTUAL PROJECT. PLEASE REFER TO THE OTHER FILE FOR "PART A" AND THE FULL PROJECT
In [2]:
# Location of the vehicle dataset CSV. Set the VEHICLE_CSV environment variable to
# point at your own copy; when it is unset, the original author's local path is
# used so existing runs behave exactly as before.
vehicle_csv_path = os.environ.get(
    "VEHICLE_CSV",
    "C:/Users/pri96/OneDrive/Documents/AI and ML PGP/Module 5 - Unsupervised Learning (Week 17 to Week 19)/Project/vehicle.csv",
)
vehicle = pd.read_csv(vehicle_csv_path)
# Preview the first five rows as the cell output.
vehicle.head()
Out[2]:
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 95 | 48.0 | 83.0 | 178.0 | 72.0 | 10 | 162.0 | 42.0 | 20.0 | 159 | 176.0 | 379.0 | 184.0 | 70.0 | 6.0 | 16.0 | 187.0 | 197 | van |
| 1 | 91 | 41.0 | 84.0 | 141.0 | 57.0 | 9 | 149.0 | 45.0 | 19.0 | 143 | 170.0 | 330.0 | 158.0 | 72.0 | 9.0 | 14.0 | 189.0 | 199 | van |
| 2 | 104 | 50.0 | 106.0 | 209.0 | 66.0 | 10 | 207.0 | 32.0 | 23.0 | 158 | 223.0 | 635.0 | 220.0 | 73.0 | 14.0 | 9.0 | 188.0 | 196 | car |
| 3 | 93 | 41.0 | 82.0 | 159.0 | 63.0 | 9 | 144.0 | 46.0 | 19.0 | 143 | 160.0 | 309.0 | 127.0 | 63.0 | 6.0 | 10.0 | 199.0 | 207 | van |
| 4 | 85 | 44.0 | 70.0 | 205.0 | 103.0 | 52 | 149.0 | 45.0 | 19.0 | 144 | 241.0 | 325.0 | 188.0 | 127.0 | 9.0 | 11.0 | 180.0 | 183 | bus |
In [3]:
# Report the dataframe's dimensions (rows, columns).
n_rows, n_cols = vehicle.shape
print(f"There are {n_rows} rows and {n_cols} columns in the dataframe")
There are 846 rows and 19 columns in the dataframe
*SOLUTION (1 B.)*¶
In [4]:
# Print each column's dtype and non-null count; columns whose non-null count is
# below the total row count contain missing values.
vehicle.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 846 entries, 0 to 845 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 compactness 846 non-null int64 1 circularity 841 non-null float64 2 distance_circularity 842 non-null float64 3 radius_ratio 840 non-null float64 4 pr.axis_aspect_ratio 844 non-null float64 5 max.length_aspect_ratio 846 non-null int64 6 scatter_ratio 845 non-null float64 7 elongatedness 845 non-null float64 8 pr.axis_rectangularity 843 non-null float64 9 max.length_rectangularity 846 non-null int64 10 scaled_variance 843 non-null float64 11 scaled_variance.1 844 non-null float64 12 scaled_radius_of_gyration 844 non-null float64 13 scaled_radius_of_gyration.1 842 non-null float64 14 skewness_about 840 non-null float64 15 skewness_about.1 845 non-null float64 16 skewness_about.2 845 non-null float64 17 hollows_ratio 846 non-null int64 18 class 846 non-null object dtypes: float64(14), int64(4), object(1) memory usage: 125.7+ KB
Based on the above information, we can infer below:
- All features are numerical except class. We could label-encode class to convert it to a numerical feature, but we won't do that right now; we'll revisit it later if it turns out to be required
- 5 features have no null values (class, hollows_ratio, max.length_rectangularity, max.length_aspect_ratio, compactness). All the other features have null values and require imputation. We'll impute each of them with the median of that feature within the corresponding class
In [5]:
# Share of missing entries per column, expressed as a percentage.
missing_percentages = vehicle.isna().mean().mul(100)
print("Percentage of missing values in each column:")
print(missing_percentages)
# Absolute number of missing entries per column (shown as the cell output).
vehicle.isna().sum()
Percentage of missing values in each column: compactness 0.000000 circularity 0.591017 distance_circularity 0.472813 radius_ratio 0.709220 pr.axis_aspect_ratio 0.236407 max.length_aspect_ratio 0.000000 scatter_ratio 0.118203 elongatedness 0.118203 pr.axis_rectangularity 0.354610 max.length_rectangularity 0.000000 scaled_variance 0.354610 scaled_variance.1 0.236407 scaled_radius_of_gyration 0.236407 scaled_radius_of_gyration.1 0.472813 skewness_about 0.709220 skewness_about.1 0.118203 skewness_about.2 0.118203 hollows_ratio 0.000000 class 0.000000 dtype: float64
Out[5]:
compactness 0 circularity 5 distance_circularity 4 radius_ratio 6 pr.axis_aspect_ratio 2 max.length_aspect_ratio 0 scatter_ratio 1 elongatedness 1 pr.axis_rectangularity 3 max.length_rectangularity 0 scaled_variance 3 scaled_variance.1 2 scaled_radius_of_gyration 2 scaled_radius_of_gyration.1 4 skewness_about 6 skewness_about.1 1 skewness_about.2 1 hollows_ratio 0 class 0 dtype: int64
In [6]:
# Collect the names of columns that contain "unexpected" values: NaN in any column,
# or a non-string value in a non-numeric column. This list is reused by the
# imputation step in a later cell.
columns_with_unexpected_values = []
for column in vehicle.columns:
    unique_values = vehicle[column].unique()
    column_is_numeric = pd.api.types.is_numeric_dtype(vehicle[column])
    unexpected_values = [
        value
        for value in unique_values
        if pd.isna(value) or (not column_is_numeric and not isinstance(value, str))
    ]
    if unexpected_values:
        print(f"Column '{column}' has unexpected values: {unexpected_values}")
        columns_with_unexpected_values.append(column)

# Rows that have at least one missing value in any column.
unexpected_rows = pd.DataFrame(vehicle[vehicle.isnull().any(axis = 1)])
if unexpected_rows.empty:
    print("No unexpected values found across datapoints.")
else:
    print(f"\nAnd some of those unexpected values across below {len(unexpected_rows)} rows:\n\n")
# Show the first few affected rows as the cell output.
unexpected_rows.head()
Column 'circularity' has unexpected values: [nan] Column 'distance_circularity' has unexpected values: [nan] Column 'radius_ratio' has unexpected values: [nan] Column 'pr.axis_aspect_ratio' has unexpected values: [nan] Column 'scatter_ratio' has unexpected values: [nan] Column 'elongatedness' has unexpected values: [nan] Column 'pr.axis_rectangularity' has unexpected values: [nan] Column 'scaled_variance' has unexpected values: [nan] Column 'scaled_variance.1' has unexpected values: [nan] Column 'scaled_radius_of_gyration' has unexpected values: [nan] Column 'scaled_radius_of_gyration.1' has unexpected values: [nan] Column 'skewness_about' has unexpected values: [nan] Column 'skewness_about.1' has unexpected values: [nan] Column 'skewness_about.2' has unexpected values: [nan] And some of those unexpected values across below 33 rows:
Out[6]:
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | 107 | NaN | 106.0 | 172.0 | 50.0 | 6 | 255.0 | 26.0 | 28.0 | 169 | 280.0 | 957.0 | 264.0 | 85.0 | 5.0 | 9.0 | 181.0 | 183 | bus |
| 9 | 93 | 44.0 | 98.0 | NaN | 62.0 | 11 | 183.0 | 36.0 | 22.0 | 146 | 202.0 | 505.0 | 152.0 | 64.0 | 4.0 | 14.0 | 195.0 | 204 | car |
| 19 | 101 | 56.0 | 100.0 | 215.0 | NaN | 10 | 208.0 | 32.0 | 24.0 | 169 | 227.0 | 651.0 | 223.0 | 74.0 | 6.0 | 5.0 | 186.0 | 193 | car |
| 35 | 100 | 46.0 | NaN | 172.0 | 67.0 | 9 | 157.0 | 43.0 | 20.0 | 150 | 170.0 | 363.0 | 184.0 | 67.0 | 17.0 | 7.0 | 192.0 | 200 | van |
| 66 | 81 | 43.0 | 68.0 | 125.0 | 57.0 | 8 | 149.0 | 46.0 | 19.0 | 146 | 169.0 | 323.0 | 172.0 | NaN | NaN | 18.0 | 179.0 | 184 | bus |
In [7]:
# Impute every column that has missing values with the median of that column
# computed within the row's own 'class' group (class-wise median).
# The filled column is assigned back to the dataframe: calling
# fillna(..., inplace=True) on vehicle[column] is chained assignment, which
# pandas 2.x deprecates and which leaves `vehicle` unchanged under copy-on-write.
for column in columns_with_unexpected_values:
    class_medians = vehicle.groupby('class')[column].transform('median')
    vehicle[column] = vehicle[column].fillna(class_medians)
# Confirm that every column now reports the full non-null count.
vehicle.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 846 entries, 0 to 845 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 compactness 846 non-null int64 1 circularity 846 non-null float64 2 distance_circularity 846 non-null float64 3 radius_ratio 846 non-null float64 4 pr.axis_aspect_ratio 846 non-null float64 5 max.length_aspect_ratio 846 non-null int64 6 scatter_ratio 846 non-null float64 7 elongatedness 846 non-null float64 8 pr.axis_rectangularity 846 non-null float64 9 max.length_rectangularity 846 non-null int64 10 scaled_variance 846 non-null float64 11 scaled_variance.1 846 non-null float64 12 scaled_radius_of_gyration 846 non-null float64 13 scaled_radius_of_gyration.1 846 non-null float64 14 skewness_about 846 non-null float64 15 skewness_about.1 846 non-null float64 16 skewness_about.2 846 non-null float64 17 hollows_ratio 846 non-null int64 18 class 846 non-null object dtypes: float64(14), int64(4), object(1) memory usage: 125.7+ KB
Now all columns have non null values. Let's check the 5 Point summary for the dataframe
In [8]:
# Summary statistics for every column (numeric and categorical), transposed so
# that each feature is one row.
vehicle.describe(include = 'all').transpose()
Out[8]:
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| compactness | 846.0 | NaN | NaN | NaN | 93.678487 | 8.234474 | 73.0 | 87.0 | 93.0 | 100.0 | 119.0 |
| circularity | 846.0 | NaN | NaN | NaN | 44.826241 | 6.13434 | 33.0 | 40.0 | 44.0 | 49.0 | 59.0 |
| distance_circularity | 846.0 | NaN | NaN | NaN | 82.066194 | 15.754263 | 40.0 | 70.0 | 80.0 | 98.0 | 112.0 |
| radius_ratio | 846.0 | NaN | NaN | NaN | 168.916076 | 33.427561 | 104.0 | 141.0 | 167.25 | 195.0 | 333.0 |
| pr.axis_aspect_ratio | 846.0 | NaN | NaN | NaN | 61.680851 | 7.882557 | 47.0 | 57.0 | 61.0 | 65.0 | 138.0 |
| max.length_aspect_ratio | 846.0 | NaN | NaN | NaN | 8.567376 | 4.601217 | 2.0 | 7.0 | 8.0 | 10.0 | 55.0 |
| scatter_ratio | 846.0 | NaN | NaN | NaN | 168.920804 | 33.199802 | 112.0 | 147.0 | 157.0 | 198.0 | 265.0 |
| elongatedness | 846.0 | NaN | NaN | NaN | 40.927896 | 7.813401 | 26.0 | 33.0 | 43.0 | 46.0 | 61.0 |
| pr.axis_rectangularity | 846.0 | NaN | NaN | NaN | 20.579196 | 2.590879 | 17.0 | 19.0 | 20.0 | 23.0 | 29.0 |
| max.length_rectangularity | 846.0 | NaN | NaN | NaN | 147.998818 | 14.515652 | 118.0 | 137.0 | 146.0 | 159.0 | 188.0 |
| scaled_variance | 846.0 | NaN | NaN | NaN | 188.643026 | 31.37802 | 130.0 | 167.0 | 179.0 | 217.0 | 320.0 |
| scaled_variance.1 | 846.0 | NaN | NaN | NaN | 439.665485 | 176.492876 | 184.0 | 318.25 | 364.0 | 586.75 | 1018.0 |
| scaled_radius_of_gyration | 846.0 | NaN | NaN | NaN | 174.712766 | 32.546284 | 109.0 | 149.0 | 174.0 | 198.0 | 268.0 |
| scaled_radius_of_gyration.1 | 846.0 | NaN | NaN | NaN | 72.443262 | 7.470873 | 59.0 | 67.0 | 71.0 | 75.0 | 135.0 |
| skewness_about | 846.0 | NaN | NaN | NaN | 6.356974 | 4.904073 | 0.0 | 2.0 | 6.0 | 9.0 | 22.0 |
| skewness_about.1 | 846.0 | NaN | NaN | NaN | 12.604019 | 8.930921 | 0.0 | 5.0 | 11.0 | 19.0 | 41.0 |
| skewness_about.2 | 846.0 | NaN | NaN | NaN | 188.919622 | 6.152167 | 176.0 | 184.0 | 188.0 | 193.0 | 206.0 |
| hollows_ratio | 846.0 | NaN | NaN | NaN | 195.632388 | 7.438797 | 181.0 | 190.25 | 197.0 | 201.0 | 211.0 |
| class | 846 | 3 | car | 429 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
- There are 846 instances with 19 attributes (columns) including both numerical (18) and categorical (1) features
- Some attributes like radius_ratio, pr.axis_aspect_ratio, max.length_aspect_ratio, scaled_variance.1, scaled_radius_of_gyration, and skewness_about.1 have relatively high standard deviations compared to their means, suggesting potential outliers or significant variability in the data
- The class attribute has 3 unique classes (car, van, bus) with car being the most frequent (429 instances). This suggests an imbalance where one class (car) dominates
- Compactness and circularity have nearly equal mean and median values, which suggests that their distributions are roughly symmetric with little skewness; this alone does not establish normality or rule out outliers, so we verify it with the plots below
- We can have further insights with various EDA
*SOLUTION (1 C.)*¶
In [9]:
# Number of observations per vehicle class.
class_counts = vehicle['class'].value_counts()

# Pie chart of the class shares, drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize = (8, 6))
ax.pie(class_counts, labels = class_counts.index, autopct = '%1.1f%%', startangle = 140)
ax.set_title('Distribution of Classes')
plt.show()

# The same shares as numbers.
print("Percentage of values for variable 'class':")
print(class_counts / len(vehicle) * 100)
Percentage of values for variable 'class': class car 50.709220 bus 25.768322 van 23.522459 Name: count, dtype: float64
Based on above pie-chart, we see that:
- Approximately 50.7% of the vehicles in the dataset are classified as cars. Buses and vans account for ~25.8% and ~23.5% respectively
- The dataset is slightly imbalanced towards cars, which constitute more than half of the vehicles. Buses and vans make up the remaining portion, with buses being slightly more frequent than vans
We can also say that the models trained on this dataframe may be biased towards predicting 'car' instances more accurately due to their higher representation in the dataset
*SOLUTION (1 D.)*¶
In [10]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged).
duplicate_rows = vehicle.loc[vehicle.duplicated()]
if duplicate_rows.empty:
    print("No duplicate rows found.")
else:
    print(f"Number of duplicate rows: {len(duplicate_rows)}")
    print("Duplicate rows:")
    print(duplicate_rows)
No duplicate rows found.
There are no duplicate rows, so no further steps are required for imputation/correction
Before proceeding to next parts, let's have some analysis on the given dataset
PAIR PLOT¶
In [11]:
# Pairwise scatter plots of all numeric features, coloured by vehicle class, with
# kernel density estimates on the diagonal.
pair_grid = sns.pairplot(data = vehicle, hue = 'class', diag_kind = 'kde')
pair_grid
Out[11]:
<seaborn.axisgrid.PairGrid at 0x2e37a01a1d0>